Text analysis
## SOURCE_TYPE Avg STD min max
## 1: instagram 340.85211 26.48027 143 362
## 2: reddit 274.77670 103.15160 20 356
## 3: twitter 261.44314 89.66503 84 686
## 4: facebook 150.19545 21.41469 20 172
## 5: web 79.92884 30.07199 1 177
## 6: youtube 56.72377 23.45628 9 133
## 7: forum 39.34595 20.52029 4 127
## 8: comment 22.00000 0.00000 22 22
## SOURCE_TYPE Avg STD min max
## 1: web 4168.3613 5050.17995 86 33436
## 2: forum 1411.3813 1490.12476 48 9651
## 3: youtube 1031.4136 927.56866 31 5396
## 4: reddit 909.1942 1630.09003 70 11632
## 5: instagram 783.6479 455.22641 151 2269
## 6: facebook 701.3283 1001.06685 20 32409
## 7: comment 334.8745 301.80986 34 1556
## 8: twitter 261.4431 89.66503 84 686
## [1] 40578 49
## [1] 22212336 49
## [1] 14003636 49
Basic word frequencies
## word N
## 1: ukrajini 50545
## 2: ukrajine 41509
## 3: ruske 41269
## 4: ukrajinu 34419
## 5: ljudi 33801
## ---
## 287472: nestašlucima 1
## 287473: šaravanji 1
## 287474: biško 1
## 287475: puteševica 1
## 287476: nekdašnja 1



Sentiment



## # A tibble: 1,145 × 2
## FROM negat…¹
## <chr> <dbl>
## 1 "RTL.hr Sport" 16.7
## 2 "Geopolitika.news" 14.3
## 3 "Prolaznik_Slučajni" 11.1
## 4 "Visokoin.com" 11.1
## 5 "Profitiraj.hr" 8.70
## 6 "HRT OTVORENO" 8.42
## 7 "Damir Vucić \U0001f1ed\U0001f1f7\U0001f1fa\U0001f1f8 \U0001f64f\U00… 7.69
## 8 "Fran Papac" 7.69
## 9 "Ivan Kutlesa" 7.69
## 10 "Ivana Cindrić" 7.69
## # … with 1,135 more rows, and abbreviated variable name ¹negativnostIndex
## # A tibble: 1,732 × 2
## FROM pozitiv…¹
## <chr> <dbl>
## 1 "liburnija" 22.2
## 2 "KAportal" 18.8
## 3 "Mersed Hasana Trakic" 16.7
## 4 "Vranjska Plus" 16.7
## 5 "Požeška biskupija" 14.7
## 6 "Glas Istre HR" 14.3
## 7 "A.S.D. Settalese" 13.3
## 8 "C60. \U0001f1ed\U0001f1f7\U0001f1fa\U0001f1e6\U0001f1ea\U0001f1fa" 13.3
## 9 "vinyldaewo" 13.2
## 10 "Tihomir Mastelic-Ivic" 12.8
## # … with 1,722 more rows, and abbreviated variable name ¹pozitivnostIndex
Analysis of most liked posts
## [1] 1059 49
## FROM N
## 1: index.hr 158
## 2: jutarnji.hr 131
## 3: 24sata.hr 102
## 4: dnevnik.hr 58
## 5: slobodnadalmacija.hr 57
## ---
## 162: SDP Hrvatske 1
## 163: dubrovackiportal.hr 1
## 164: fightsite.hr 1
## 165: hcl.hr 1
## 166: Osječki taxi 1
## FROM LIKES
## 1: 24sata.hr 260735
## 2: index.hr 215720
## 3: jutarnji.hr 167645
## 4: slobodnadalmacija.hr 102494
## 5: dnevnik.hr 68541
## ---
## 162: SDP Hrvatske 523
## 163: dubrovackiportal.hr 520
## 164: fightsite.hr 517
## 165: hcl.hr 514
## 166: Osječki taxi 500
## [1] 788444 49
## word N
## 1: ukrajini 1615
## 2: ruske 1519
## 3: ljudi 1275
## 4: ukrajinu 1275
## 5: ukrajine 1247
## ---
## 60216: brodskih 1
## 60217: eskadrila 1
## 60218: razarača 1
## 60219: fregatu 1
## 60220: pomorskim 1

Term importance
## Share of words by domain
# Count how often each word occurs within each of the four compared sources.
# `%in%` matches whole strings case-sensitively, so the labels must be spelled
# exactly as they appear in FROM; the FROM listings printed in this report
# show "24sata.hr" and "index.hr" (not "24sata" / "Index.hr").
# NOTE(review): confirm these four labels against the FROM column of
# fb_tokenTidy_TopLike.
domenaWords <- fb_tokenTidy_TopLike %>%
  filter(
    FROM %in% c("24sata.hr", "jutarnji.hr", "slobodnadalmacija.hr", "index.hr")
  ) %>%
  count(FROM, word, sort = TRUE)
# Total number of counted words per source.
ukupnoWords <- domenaWords %>%
  group_by(FROM) %>%
  summarise(totWords = sum(n))
# Attach the per-source totals; the join key is stated explicitly so the join
# does not silently depend on whichever column names happen to coincide.
domenaWords <- left_join(domenaWords, ukupnoWords, by = "FROM")
# domenaWords %>% head(15)
# Distribution of relative word frequencies per source (kept for reference;
# the grouping column in this data is FROM):
# domenaWords %>%
#   ggplot(., aes(n / totWords, fill = FROM)) +
#   geom_histogram(show.legend = FALSE) +
#   xlim(NA, 0.0009) +
#   facet_wrap(~FROM, ncol = 2, scales = "free_y")
## Most important words by domain
# tf-idf treats every source (FROM) as one document.
idf <- domenaWords %>%
  bind_tf_idf(word, FROM, n)
# idf %>% head(10)
# idf %>%
#   select(-totWords) %>%
#   arrange(desc(tf_idf))
# Plot the highest tf-idf words of each source, one facet per source.
idf %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor levels in descending tf-idf order so the flipped bars are
  # drawn from largest (top) to smallest (bottom).
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  mutate(FROM = factor(FROM)) %>%
  group_by(FROM) %>%
  # Rank explicitly by tf_idf instead of relying on it being the last column.
  top_n(11, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = FROM)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~FROM, ncol = 2, scales = "free") +
  coord_flip() +
  theme_economist()

Phrases
# Split the text of every top-liked post into two-word phrases (bigrams).
fb_bigram <- fb_TopLike %>%
  unnest_tokens(bigram, FULL_TEXT, token = "ngrams", n = 2)
# fb_bigram %>% head(10)
# fb_bigram %>%
#   count(bigram, sort = TRUE) %>%
#   head(25)
# One column per word so that each half of the bigram can be cleaned separately.
fb_bigram_sep <- fb_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Drop bigrams in which either word is a stop word, then set to NA every word
# that contains a digit or consists of a single ASCII letter.
# `replace()` + `grepl()` states the intent directly; the result is the same as
# the NA-replacement form of `gsub()` (matching elements become NA, elements
# that are already NA stay NA).
fb_bigram_tidy <- fb_bigram_sep %>%
  filter(!word1 %in% stop_corpus$word) %>%
  filter(!word2 %in% stop_corpus$word) %>%
  mutate(
    word1 = replace(word1, grepl("\\d+", word1), NA_character_),
    word2 = replace(word2, grepl("\\d+", word2), NA_character_)
  ) %>%
  mutate(
    word1 = replace(word1, grepl("^[a-zA-Z]$", word1), NA_character_),
    word2 = replace(word2, grepl("^[a-zA-Z]$", word2), NA_character_)
  )
# Frequency of every (word1, word2) pair; pairs with an NA half are still
# counted here, as before.
fb_bigram_tidy_bigram_counts <- fb_bigram_tidy %>%
  count(word1, word2, sort = TRUE)
# Re-join the two words into one phrase. Incomplete bigrams are removed on the
# real missing values *before* uniting, rather than by searching the united
# string for the letters "NA", which would also discard any genuine text
# containing "NA".
bigrams_united <- fb_bigram_tidy %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  unite(bigram, word1, word2, sep = " ")
# bigrams_united
# Bigram frequency per source.
topicBigram <- bigrams_united %>%
  count(FROM, bigram, sort = TRUE)
# The 45 most frequent bigrams overall.
bigrams_united %>%
  count(bigram, sort = TRUE) %>%
  head(45)
## bigram n
## 1 ruske snage 390
## 2 vanjskih poslova 337
## 3 ukrajinski predsjednik 241
## 4 ruske invazije 230
## 5 vladimir putin 217
## 6 ministarstvo obrane 190
## 7 predsjednik volodimir 190
## 8 pročitajte ovdje 168
## 9 volodimir zelenski 162
## 10 ruska vojska 153
## 11 milijuna kuna 134
## 12 ministar vanjskih 133
## 13 ruski predsjednik 132
## 14 rusko ministarstvo 127
## 15 predsjednik vladimir 126
## 16 ukrajinske snage 125
## 17 društvenim mrežama 124
## 18 ukrajinska vojska 121
## 19 nekoliko dana 119
## 20 ispod oglasa 112
## 21 nastavlja ispod 112
## 22 humanitarne pomoći 109
## 23 ruske trupe 109
## 24 vladimira putina 109
## 25 protiv rusije 103
## 26 ruske federacije 102
## 27 crvenog križa 99
## 28 humanitarnu pomoć 98
## 29 ruskih vojnika 95
## 30 humanitarnu akciju 93
## 31 europske unije 89
## 32 ministarstvo vanjskih 89
## 33 ratne zločine 89
## 34 sjedinjene države 89
## 35 godinu dana 88
## 36 svaki dan 88
## 37 ukrajinskog predsjednika 88
## 38 ruskih snaga 85
## 39 volodimir zelenskij 80
## 40 ruske agresije 79
## 41 ministar obrane 78
## 42 oružane snage 78
## 43 mjesec dana 77
## 44 novinska agencija 77
## 45 ukrajinski ministar 77
# Most important bigrams by domain
# tf-idf is computed over all sources, each source (FROM) acting as one document.
bigram_tf_idf <- bigrams_united %>%
  count(FROM, bigram) %>%
  bind_tf_idf(bigram, FROM, n) %>%
  arrange(desc(tf_idf))
# Plot the highest tf-idf bigrams for the four compared sources.
# `%in%` matches whole strings case-sensitively, so the labels must be spelled
# exactly as they appear in FROM; the FROM listings printed in this report
# show "24sata.hr" and "index.hr" (not "24sata" / "Index.hr").
# NOTE(review): confirm these four labels against the FROM column of
# bigrams_united.
bigram_tf_idf %>%
  filter(
    FROM %in% c("24sata.hr", "jutarnji.hr", "slobodnadalmacija.hr", "index.hr")
  ) %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor levels in descending tf-idf order so the flipped bars are
  # drawn from largest (top) to smallest (bottom).
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(FROM) %>%
  # Rank explicitly by tf_idf instead of relying on it being the last column.
  top_n(20, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(bigram, tf_idf, fill = FROM)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~FROM, ncol = 2, scales = "free") +
  coord_flip() +
  theme_economist()

PHRASES CORRELATION
# Pairwise correlation between words, based on the dates (DATE) on which they
# occur. Only words with more than 200 occurrences are kept, and rows with a
# missing word are removed.
corsWords <- fb_tokenTidy %>%
  # filter(datum > "2020-02-20") %>%
  group_by(word) %>%
  filter(n() > 200) %>%
  # The grouping is needed only for the frequency filter above; drop it so the
  # correlation step receives a plain, ungrouped table.
  ungroup() %>%
  filter(!is.na(word)) %>%
  pairwise_cor(word, DATE, sort = TRUE)
# corsWords %>%
#   filter(item1 == "oporavak")
# For each seed word, plot the ten words most strongly correlated with it.
# NOTE(review): the seed words "kupnja", "akcija", "poklon" look carried over
# from a different analysis; confirm they occur in this corpus (and pass the
# > 200 filter), otherwise the plot will be empty.
corsWords %>%
  filter(item1 %in% c("kupnja", "akcija", "poklon")) %>%
  group_by(item1) %>%
  # Rank explicitly by correlation instead of relying on it being the last
  # column.
  top_n(10, correlation) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ item1, scales = "free") +
  coord_flip() +
  theme_economist()
THEMATIC ANALYSIS
# Document-term matrix for the top-liked posts: one document per source (FROM),
# one term per word, cell value = number of occurrences.
dtm <- cast_dtm(
  count(fb_tokenTidy_TopLike, FROM, word, sort = TRUE),
  FROM, word, n
)
# Fit an LDA model with four topics; the seed is fixed at 1234 through the
# control list.
fb_LDA <- LDA(dtm, k = 4, control = list(seed = 1234))
# Long-format table of the "beta" matrix: one row per topic/term pair.
fb_LDA_tidy <- tidy(fb_LDA, matrix = "beta")
# fb_LDA_tidy
# Keep the 15 highest-beta terms of every topic, ordered by topic and then by
# descending beta.
insta_terms <- fb_LDA_tidy %>%
  drop_na() %>%
  group_by(topic) %>%
  top_n(15, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))
# insta_terms
# Bar chart of the top terms, one facet per topic. reorder_within() together
# with scale_x_reordered() orders the terms by beta separately in each facet.
topic_plot_data <- mutate(insta_terms, term = reorder_within(term, beta, topic))
ggplot(topic_plot_data, aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  theme_economist()
